# Sarima
suppressMessages(library(astsa))
suppressMessages(library(sarima))
# Arima forecast
suppressMessages(library(forecast))
# Skewness Kurtosis
suppressMessages(library(moments))
# Seasonality test
suppressMessages(library(seastests))
# Plotly
suppressMessages(library(plotly))
# Anomalies
suppressMessages(library(anomalize))
# Time series Manipulation
suppressMessages(library(xts))
# Plots
suppressMessages(library(ggplot2))
# A set of packages
suppressMessages(library(tidyverse))
# Converting everything to everything
suppressMessages(library(tsbox))
# 'ggpubr' provides easy-to-use functions for creating and customizing 'ggplot2'- based publication ready plots
suppressMessages(library(ggpubr))
# This package has functions that plot decomposed time series data with ggplot2
suppressMessages(library(ggplottimeseries))
# Publication ready theme
theme_set(theme_pubr())
# tables
suppressMessages(library(knitr))
suppressMessages(library(kableExtra))
# Missing data mapping
suppressMessages(library(naniar))
suppressMessages(library(pander))
Abstract
The aim of this project was to construct a SARIMA time series model based on the time series of industrial production of Austria obtained from the http://datamarket.com web site. After exploratory data analysis and appropriate transformations, five candidate models were estimated. The best model was selected using the AIC criterion and diagnostic checking. Once the SARIMA (2,1,0) (0,1,1) model was constructed and fitted, it was used for forecasting and simulation.
Data
# CSV data - import from CSV file (column 2 holds the production index)
data_csv <- read.csv('austria-qu.csv')
# Missing data mapping
vis_miss(data_csv)

# Quarterly time series starting at 1975 Q4
data_ts<-ts(data_csv[,2],start = c(1975,4),frequency = 4)
# Conversion to tibble (tsbox); index column is 'time', values in 'value'
data_tibble<-ts_tibbletime(data_ts)
## Loading required namespace: tibbletime
# Extend tibble data with month/year/quarter factors for later plots
data_tibble$month <- factor(format(data_tibble$time, "%b"))
data_tibble$year <- factor(format(data_tibble$time, "%Y"))
# Derive the calendar quarter directly from the date.
# FIX: the original relabelled the month factor with c('Q2','Q1','Q3','Q4'),
# relying on the locale-dependent alphabetical order of the month
# abbreviations (Apr, Jan, Jul, Oct); quarters() is locale-independent
# and yields the same Q1-Q4 values.
data_tibble$quarter<-factor(quarters(data_tibble$time))
pander(data_ts)
| 1975 |
NA |
NA |
NA |
54.1 |
| 1976 |
59.5 |
56.5 |
63.9 |
57.8 |
| 1977 |
62 |
58.5 |
65 |
59.6 |
| 1978 |
63.6 |
60.4 |
66.3 |
60.6 |
| 1979 |
66.8 |
63.2 |
71 |
66.5 |
| 1980 |
72 |
67.8 |
75.6 |
69.2 |
| 1981 |
74.1 |
70.7 |
77.8 |
72.3 |
| 1982 |
78.1 |
72.4 |
82.6 |
72.9 |
| 1983 |
79.5 |
72.6 |
82.8 |
76 |
| 1984 |
85.1 |
80.5 |
89.1 |
84.8 |
| 1985 |
94.2 |
89.5 |
99.3 |
93.1 |
| 1986 |
103.5 |
96.4 |
107.2 |
101.7 |
| 1987 |
109.5 |
101.3 |
112.6 |
105.5 |
| 1988 |
115.4 |
108 |
129.9 |
112.4 |
| 1989 |
123.6 |
114.9 |
131 |
122.6 |
| 1990 |
131.9 |
120.5 |
130.7 |
115.7 |
| 1991 |
119.7 |
109.7 |
125.1 |
NA |
pander(head(data_tibble))
| 1975-10-01 |
54.1 |
Oct |
1975 |
Q4 |
| 1976-01-01 |
59.5 |
Jan |
1976 |
Q1 |
| 1976-04-01 |
56.5 |
Apr |
1976 |
Q2 |
| 1976-07-01 |
63.9 |
Jul |
1976 |
Q3 |
| 1976-10-01 |
57.8 |
Oct |
1976 |
Q4 |
| 1977-01-01 |
62 |
Jan |
1977 |
Q1 |
Exploratory Data Analysis
Observations and Trend
# Extract the trend with a centred moving average of order 4
# (one full year of quarters)
trend <- ma(data_ts, order = 4, centre = TRUE)
trend_ts <- as.ts(trend)
trend_tibble <- ts_tibbletime(trend_ts)

# Observed series
p1 <- ggplot(data_tibble, aes(x = time, y = value)) +
  geom_line(color = 'darkblue', size = 1) +
  theme_gray() +
  labs(title = 'Time Series', x = 'Time', y = 'Observed Values')
fig1 <- ggplotly(p1)
fig1

# Smoothed trend component
p2 <- ggplot(trend_tibble, aes(x = time, y = value)) +
  geom_line(color = 'darkblue', size = 1) +
  theme_gray() +
  labs(title = 'Trend', x = 'Time', y = 'Trend')
fig2 <- ggplotly(p2)
fig2
Anomalize
# Build a (date, value) tibble for anomalize; Year holds quarter labels
# that as.yearqtr() parses and as.Date() converts to quarter-start dates.
x<-as.yearqtr(data_csv$Year)
x<-as.Date(x)
y<-data_csv$Production
tbl<-tibble(x,y)
# STL-style decomposition with anomaly flags on the remainder
# (alpha = 0.05, at most 20% of points flagged)
p1 <- tbl %>%
time_decompose(y) %>%
anomalize(remainder, alpha = 0.05, max_anoms = 0.2) %>%
plot_anomaly_decomposition() +
ggtitle("Anomaly Decomposition")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = x
## frequency = 4 quarters
## trend = 17 quarters
# Recompose the series and plot the anomalies on the original scale
p2 <- tbl %>%
time_decompose(y) %>%
anomalize(remainder, alpha = 0.05, max_anoms = 0.2) %>%
time_recompose() %>%
plot_anomalies(time_recomposed = TRUE) +
geom_line(color='darkblue')+
ggtitle("Anomalies")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = x
## frequency = 4 quarters
## trend = 17 quarters
# Keep only the rows flagged as anomalous for the summary table below
p3<-tbl %>%
time_decompose(y) %>%
anomalize(remainder,alpha = 0.05, max_anoms = 0.2) %>%
time_recompose() %>%
filter(anomaly == 'Yes')
## Converting from tbl_df to tbl_time.
## Auto-index message: index = x
## frequency = 4 quarters
## trend = 17 quarters
p1

p2

# Tidy table of the detected anomalies.
# FIX: column names corrected from the original misspellings
# 'observerd' and 'seson'.
p4<-tibble(x=p3$x,
observed=p3$observed,
season=p3$season,
trend=p3$trend,
remainder=p3$remainder,
anomaly=p3$anomaly)
p4%>%pander()
| 1988-07-01 |
129.9 |
4.127 |
114.2 |
11.55 |
Yes |
| 1989-07-01 |
131 |
4.127 |
117.3 |
9.524 |
Yes |
| 1989-10-01 |
122.6 |
-3.214 |
117.5 |
8.331 |
Yes |
| 1990-01-01 |
131.9 |
2.459 |
117.9 |
11.58 |
Yes |
| 1990-07-01 |
130.7 |
4.127 |
118.6 |
7.956 |
Yes |
Seasonality
The WO-test gives out a TRUE if the series is seasonal and FALSE otherwise.
# Webel-Ollech overall seasonality test (seastests); TRUE = seasonal
summary(wo(data_ts))
## Test used: WO
##
## Test statistic: 1
## P-value: 0 0.01518954 5.843575e-05
##
## The WO - test identifies seasonality
# This function converts time series-class data into a data frame of decomposed time series.
df <- dts2(data_ts, type ="additive")
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# plots decomposed time series into one figure
p<-ggdecompose(df)+
theme_gray()+
xlab("Time")+
ylab("Index")
p
## Warning: Removed 2 row(s) containing missing values (geom_path).

# One line per year across the four quarters — parallel lines indicate
# a stable seasonal pattern
p<-ggseasonplot(data_ts) +
theme_gray()+
ylab("Index") +
ggtitle("Seasonal Plot")
p

# One panel per quarter with its own mean line
p<-ggsubseriesplot(data_ts) +
theme_gray()+
ylab("Index") +
ggtitle("Seasonal subseries plot")
p

Box Plot
# Distribution of the index by quarter (interactive box plot)
p <- ggplot(data_tibble,
            aes(x = factor(quarter, levels = c('Q1','Q2','Q3','Q4'), ordered = TRUE),
                y = value)) +
  geom_boxplot() +
  theme_gray() +
  labs(title = 'BoxPlot', x = 'Season', y = 'Index')
fig <- ggplotly(p)
fig

# Base-graphics box plot; boxplot() also returns the five-number
# summary in $stats, which we tabulate below
z <- boxplot(data_ts ~ cycle(data_ts), xlab = 'Quarter', col = 'beige',
             ylab = 'Industrial Production Index', ylim = c(50, 140), axes = FALSE)
headsv <- c('Q1', 'Q2', 'Q3', 'Q4')
headsh <- c('Minimum', '25th Percentile', 'Median', '75th Percentile', 'Maximum')
zdf <- data.frame(z$stats)
rownames(zdf) <- headsh
colnames(zdf) <- headsv
zdf %>%
  kable %>%
  kable_styling(bootstrap_options = c("responsive"), full_width = FALSE, position = "left") %>%
  column_spec(1:1, bold = TRUE)
|
|
Q1
|
Q2
|
Q3
|
Q4
|
|
Minimum
|
59.50
|
56.50
|
63.90
|
54.10
|
|
25th Percentile
|
69.40
|
65.50
|
73.30
|
63.55
|
|
Median
|
82.30
|
76.55
|
85.95
|
74.45
|
|
75th Percentile
|
112.45
|
104.65
|
118.85
|
103.60
|
|
Maximum
|
131.90
|
120.50
|
131.00
|
122.60
|
Fitting
Based on Summary of rules for identifying ARIMA models https://people.duke.edu/~rnau/arimrule.htm
# Box-Cox transformation parameter shared by all candidate models.
# NOTE(review): 'lmbd' was used but never defined anywhere in this file;
# the auto.arima output below reports "Box Cox transformation:
# lambda= -0.65", which matches BoxCox.lambda(data_ts) — confirm against
# the original analysis.
lmbd<-BoxCox.lambda(data_ts)
# Five candidate models: all share d=1, D=1 and seasonal (0,1,1)[4],
# varying only the non-seasonal AR/MA orders
fit1<-Arima(data_ts,order=c(0,1,0), seasonal=c(0,1,1),lambda=lmbd)
fit2<-Arima(data_ts,order=c(1,1,0), seasonal=c(0,1,1),lambda=lmbd)
fit3<-Arima(data_ts,order=c(0,1,1), seasonal=c(0,1,1),lambda=lmbd)
fit4<-Arima(data_ts,order=c(2,1,0), seasonal=c(0,1,1),lambda=lmbd)
fit5<-Arima(data_ts,order=c(1,1,1), seasonal=c(0,1,1),lambda=lmbd)
AIC Criteria - Model Selection
# Compare the five candidates by AIC (lower is better) and render the
# ranking with the best model highlighted
aics <- c(fit1$aic, fit2$aic, fit3$aic, fit4$aic, fit5$aic)
aics
names <- paste0('fit', 1:5)
aicset <- data.frame(names, aics)
oset <- aicset[order(aics), ]
kable(oset) %>%
  kable_styling(bootstrap_options = c("responsive"), full_width = FALSE, position = 'left') %>%
  row_spec(1:1, bold = TRUE, color = "white", background = "darkblue")
|
|
names
|
aics
|
|
4
|
fit4
|
-613.8653
|
|
5
|
fit5
|
-611.6431
|
|
2
|
fit2
|
-611.3176
|
|
1
|
fit1
|
-610.7737
|
|
3
|
fit3
|
-610.4027
|
# Index of the minimum-AIC model.
# which.min() is the idiomatic single-result form of
# which(aics == min(aics)); identical here since the AICs are distinct.
on<-which.min(aics)
cat('Minimum AIC =',aics[on],' fit',on)
## Minimum AIC = -613.8653 fit 4
Auto ARIMA
Auto Arima suggested model
# Let auto.arima search (bounded orders, forced d=1, D=1) for comparison
# with the manually selected fits; it agrees with fit4: (2,1,0)(0,1,1)[4].
# Uses the same Box-Cox lambda 'lmbd' as the candidate fits above.
auto.arima(data_ts, d = 1, D = 1, max.p = 2, max.q = 2, max.P = 2,
max.Q = 2, max.order = 5, seasonal = TRUE, lambda=lmbd)
## Series: data_ts
## ARIMA(2,1,0)(0,1,1)[4]
## Box Cox transformation: lambda= -0.65
##
## Coefficients:
## ar1 ar2 sma1
## -0.1529 0.2860 -0.7189
## s.e. 0.1276 0.1318 0.1227
##
## sigma^2 estimated as 1.736e-06: log likelihood=310.93
## AIC=-613.87 AICc=-613.12 BIC=-605.56
Confirmatory Data Analysis
ACF and PACF Residuals
# Residuals of the selected model (fit4); reused by the normality
# checks further below
resd <- fit4$residuals

# Residual autocorrelations — white-noise residuals should show no
# significant spikes
p <- ggAcf(resd) + theme_gray() + labs(title = 'Residuals ACF')
fig <- ggplotly(p)
fig

# Partial autocorrelations of the residuals
p <- ggPacf(resd) + theme_gray() + labs(title = 'Residuals PACF')
fig <- ggplotly(p)
fig
Residuals - Histogram
# Histogram with kernel-density overlay of the residuals.
# FIX: coerce the 'ts'-classed residuals to a plain numeric vector so
# ggplot does not fall back to a default scale for the ts class (this
# was the source of the "Don't know how to automatically pick scale"
# message in the original output).
rdf<-data.frame(resd = as.numeric(resd))
p <- ggplot(rdf, aes(x=resd)) +
geom_histogram(aes(y = ..density..), fill = "lightgrey", color="darkblue", size=.2,binwidth=.001) +
geom_density(color = "red") +
# dashed line at the residual mean (should be near zero)
geom_vline(aes(xintercept=mean(resd, na.rm=TRUE)),
color="red", linetype="dashed", size=.5)+
theme_gray()+
labs(x="")+
labs(y="Density")+
ggtitle("Density with Histogram Overlay")
fig <- ggplotly(p)
fig
Q-Q plot, Shapiro test, skewness and kurtosis
# Normal Q-Q plot of the residuals — points on the line indicate normality
p<-ggplot(rdf, aes(sample=resd))+
stat_qq(color='red', shape=1)+
stat_qq_line(color='darkblue')+
theme_gray()+
ggtitle("QQ Plot")
ggplotly(p)
# Shapiro-Wilk test: large p-value => no evidence against normality
shapiro.test(resd)
##
## Shapiro-Wilk normality test
##
## data: resd
## W = 0.9942, p-value = 0.9915
# Sample skewness and kurtosis (moments package); a normal distribution
# has skewness 0 and kurtosis 3
sk<-skewness(resd)
kt<-kurtosis(resd)
# sk
# kt
skdf<-data.frame(Parameter=c('Skewnes','Kurtosis'),
Value=c(sk,kt),
Ideal=c(0,3),
Diff=c(0-sk,3-kt))
kable(skdf)%>%
kable_styling(bootstrap_options = c("responsive"), full_width = F, position = "left")
|
Parameter
|
Value
|
Ideal
|
Diff
|
|
Skewnes
|
0.0698136
|
0
|
-0.0698136
|
|
Kurtosis
|
2.6216425
|
3
|
0.3783575
|
Original and Fitted Series
# Overlay the observed series and the in-sample fitted values of fit4.
# fit4$x is the original series, fit4$fitted the one-step fitted values.
tibble_original<-ts_tibbletime(fit4$x)
tibble_fitted<-ts_tibbletime(fit4$fitted)
tibble_2<-tibble_original
tibble_2$fitted<-tibble_fitted$value
tibble_2$original<-tibble_original$value
# Reshape wide -> long so both series share one colour-mapped legend.
# NOTE(review): gather() is superseded by pivot_longer() in current
# tidyr; kept as-is to preserve behaviour on the tbl_time object.
tibble_2<-gather(tibble_2,series,val,original:fitted)
print(tibble_2)
## # A time tibble: 128 x 4
## # Index: time
## time value series val
## <date> <dbl> <chr> <dbl>
## 1 1975-10-01 54.1 original 54.1
## 2 1976-01-01 59.5 original 59.5
## 3 1976-04-01 56.5 original 56.5
## 4 1976-07-01 63.9 original 63.9
## 5 1976-10-01 57.8 original 57.8
## 6 1977-01-01 62 original 62
## 7 1977-04-01 58.5 original 58.5
## 8 1977-07-01 65 original 65
## 9 1977-10-01 59.6 original 59.6
## 10 1978-01-01 63.6 original 63.6
## # ... with 118 more rows
# Colours map alphabetically: fitted = darkblue, original = red
p<-ggplot(tibble_2,aes(x=time,y=val,color=series))+
geom_point()+
geom_line()+
theme(legend.position = "top")+
theme_gray()+
scale_color_manual(values=c("darkblue", "red"))+
ggtitle("Original and Fitted Series")
fig <- ggplotly(p)
fig
SARIMA Model Summary
# Full diagnostics (t-table, residual plots) for SARIMA(2,1,0)(0,1,1)[4].
# NOTE(review): 'trdata' is not defined anywhere in this file; since the
# reported log likelihood (310.93) and coefficients match fit4, it is
# presumably the Box-Cox transformed series, e.g. BoxCox(data_ts, lmbd)
# — confirm against the original analysis.
astsa::sarima(trdata, 2, 1, 0, P = 0, D = 1, Q = 1, S = 4,
details = TRUE, Model=TRUE)
## initial value -6.495164
## iter 2 value -6.637605
## iter 3 value -6.660639
## iter 4 value -6.676935
## iter 5 value -6.684088
## iter 6 value -6.688198
## iter 7 value -6.690198
## iter 8 value -6.690400
## iter 9 value -6.690404
## iter 10 value -6.690407
## iter 10 value -6.690407
## iter 10 value -6.690407
## final value -6.690407
## converged
## initial value -6.687495
## iter 2 value -6.688901
## iter 3 value -6.688982
## iter 4 value -6.688983
## iter 4 value -6.688983
## iter 4 value -6.688983
## final value -6.688983
## converged

## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), include.mean = !no.constant, transform.pars = trans, fixed = fixed,
## optim.control = list(trace = trc, REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ar2 sma1
## -0.1529 0.2860 -0.7189
## s.e. 0.1276 0.1318 0.1227
##
## sigma^2 estimated as 1.476e-06: log likelihood = 310.93, aic = -613.87
##
## $degrees_of_freedom
## [1] 56
##
## $ttable
## Estimate SE t.value p.value
## ar1 -0.1529 0.1276 -1.1990 0.2356
## ar2 0.2860 0.1318 2.1689 0.0343
## sma1 -0.7189 0.1227 -5.8597 0.0000
##
## $AIC
## [1] -9.901052
##
## $AICc
## [1] -9.894378
##
## $BIC
## [1] -9.767018
Forecast
# 12-quarter (three-year) ahead forecast from the selected model, with
# 80% and 95% prediction intervals, rendered as a table
fc <- forecast(fit4, h = 12)
fcdf <- as.data.frame(fc)
fcdf %>%
  kable %>%
  kable_styling(bootstrap_options = c("responsive"), full_width = FALSE, position = 'left')
|
|
Point Forecast
|
Lo 80
|
Hi 80
|
Lo 95
|
Hi 95
|
|
1991 Q4
|
113.0014
|
108.99967
|
117.2514
|
106.97500
|
119.6091
|
|
1992 Q1
|
122.3868
|
116.47060
|
128.8158
|
113.52713
|
132.4482
|
|
1992 Q2
|
112.6067
|
105.80155
|
120.1655
|
102.46910
|
124.5125
|
|
1992 Q3
|
127.9640
|
118.25563
|
139.0614
|
113.59775
|
145.5954
|
|
1992 Q4
|
115.6914
|
105.71294
|
127.3252
|
100.99155
|
134.2887
|
|
1993 Q1
|
125.2040
|
112.44497
|
140.5413
|
106.53441
|
149.9614
|
|
1993 Q2
|
115.1707
|
102.70919
|
130.3395
|
96.98583
|
139.7568
|
|
1993 Q3
|
131.0358
|
114.42557
|
152.0369
|
106.98941
|
165.5120
|
|
1993 Q4
|
118.3352
|
102.71239
|
138.3068
|
95.76931
|
151.2464
|
|
1994 Q1
|
128.1835
|
109.07697
|
153.5174
|
100.78650
|
170.4761
|
|
1994 Q2
|
117.7817
|
99.81706
|
141.7832
|
92.06032
|
157.9612
|
|
1994 Q3
|
134.2575
|
110.89694
|
167.0153
|
101.11611
|
190.1021
|
# Quarterly date axes: t1 spans the observed sample, t2 the 12 forecast steps
t1<-seq(as.Date("1975/10/1"), as.Date("1991/07/1"), by = "quarter")
t2<-seq(as.Date("1991/10/1"), as.Date("1994/07/1"), by = "quarter")
df1<-data.frame(time = t1, value = data_csv$Production, series = "observations")
df2<-data.frame(time = t2, value = fcdf$`Point Forecast`, series = "forecast")
# Prediction-interval bounds, one frame per band.
# FIX: the original put df$col expressions inside aes() without a data
# argument, bypassing ggplot's data masking; pass the bounds through
# data=/aes() instead.
band80<-data.frame(time = t2, lo = fcdf$`Lo 80`, hi = fcdf$`Hi 80`)
band95<-data.frame(time = t2, lo = fcdf$`Lo 95`, hi = fcdf$`Hi 95`)
p<-ggplot() +
geom_line(data=df1, mapping=aes(x = time, y = value,color=series), size=.4) +
geom_line(data=df2, mapping=aes(x = time, y = value,color=series), size=.4)+
geom_point(data=df2, mapping=aes(x = time, y = value),color='red', size=.8)+
geom_ribbon(data=band95, aes(x=time, ymin=lo, ymax=hi),fill = "grey50", alpha = 0.2)+
geom_ribbon(data=band80, aes(x=time, ymin=lo, ymax=hi),fill = "grey70", alpha = 0.2)+
scale_color_manual(values=c("red", "darkblue"))+
theme_gray()+
labs(x="Time")+
# FIX: label typo "Observationsand Forecast" corrected
labs(y="Observations and Forecast")+
ggtitle("Forecast")
fig<-ggplotly(p)
fig
Time Series Cross-Validation
Using the tsCV function we measure the predictive performance of all five models.
# Forecast-function factories for tsCV.
# BUG FIX: the originals refitted every model on the full 'data_ts'
# instead of the training subset 'x' that tsCV supplies, so each
# "cross-validated" forecast came from a model that had already seen
# the held-out observations. Fit on 'x' instead.
far1<-function(x, h){forecast(Arima(x,order=c(0,1,0), seasonal=c(0,1,1),lambda=lmbd), h)}
far2<-function(x, h){forecast(Arima(x,order=c(1,1,0), seasonal=c(0,1,1),lambda=lmbd), h)}
far3<-function(x, h){forecast(Arima(x,order=c(0,1,1), seasonal=c(0,1,1),lambda=lmbd), h)}
far4<-function(x, h){forecast(Arima(x,order=c(2,1,0), seasonal=c(0,1,1),lambda=lmbd), h)}
far5<-function(x, h){forecast(Arima(x,order=c(1,1,1), seasonal=c(0,1,1),lambda=lmbd), h)}
# 12-step-ahead CV errors; MSE per model over all origins/horizons
e1<-tsCV (data_ts, far1, h = 12)
avg_e1<-mean(e1^2,na.rm=TRUE)
e2<-tsCV (data_ts, far2, h = 12)
avg_e2<-mean(e2^2,na.rm=TRUE)
e3<-tsCV (data_ts, far3, h = 12)
avg_e3<-mean(e3^2,na.rm=TRUE)
e4<-tsCV (data_ts, far4, h = 12)
avg_e4<-mean(e4^2,na.rm=TRUE)
e5<-tsCV (data_ts, far5, h = 12)
avg_e5<-mean(e5^2,na.rm=TRUE)
# Rank the models by cross-validated MSE, best model highlighted
errors<-c(avg_e1,avg_e2,avg_e3,avg_e4,avg_e5)
names<-c('MSE 1','MSE 2','MSE 3','MSE 4','MSE 5')
errorset<-data.frame(names,errors)
oerrorset<-errorset[order(errors),]
kable(oerrorset)%>%
kable_styling(bootstrap_options = c("responsive"), full_width = F, position='left') %>%
row_spec(1:1, bold = T, color = "white", background = "darkblue")
|
|
names
|
errors
|
|
2
|
MSE 2
|
1359.492
|
|
3
|
MSE 3
|
1359.795
|
|
1
|
MSE 1
|
1375.393
|
|
4
|
MSE 4
|
1472.709
|
|
5
|
MSE 5
|
1489.297
|
# Index of the minimum-MSE model.
# which.min() is the idiomatic single-result form of
# which(errors == min(errors)); identical here since the MSEs are distinct.
min.mse<-which.min(errors)
cat('Minimum MSE =',errors[min.mse],' fit ',min.mse)
## Minimum MSE = 1359.492 fit 2
Min MSE Model Summary
# Diagnostics for the minimum-MSE model, SARIMA(1,1,0)(0,1,1)[4] (fit2).
# NOTE(review): 'trdata' is not defined anywhere in this file; presumably
# the Box-Cox transformed series (see the fit4 summary above) — confirm.
astsa::sarima(trdata, 1, 1, 0, P = 0, D = 1, Q = 1, S = 4,
details = TRUE, Model=TRUE)
## initial value -6.503379
## iter 2 value -6.623008
## iter 3 value -6.627969
## iter 4 value -6.648726
## iter 5 value -6.656746
## iter 6 value -6.659204
## iter 7 value -6.659887
## iter 8 value -6.660151
## iter 9 value -6.660155
## iter 9 value -6.660155
## iter 9 value -6.660155
## final value -6.660155
## converged
## initial value -6.649834
## iter 2 value -6.650443
## iter 2 value -6.650443
## iter 2 value -6.650443
## final value -6.650443
## converged

## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), include.mean = !no.constant, transform.pars = trans, fixed = fixed,
## optim.control = list(trace = trc, REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 sma1
## -0.2079 -0.6620
## s.e. 0.1291 0.1261
##
## sigma^2 estimated as 1.608e-06: log likelihood = 308.66, aic = -611.32
##
## $degrees_of_freedom
## [1] 57
##
## $ttable
## Estimate SE t.value p.value
## ar1 -0.2079 0.1291 -1.6111 0.1127
## sma1 -0.6620 0.1261 -5.2520 0.0000
##
## $AIC
## [1] -9.859961
##
## $AICc
## [1] -9.85668
##
## $BIC
## [1] -9.759435
Simulation
# Simulate 64 quarters from the fitted SARIMA(2,1,0)(0,1,1)[4].
# BUG FIX: fit4$coef is ordered (ar1, ar2, sma1) — see the coefficient
# table printed above — but the original mapped coef[1] (ar1) to 'sma',
# coef[2] (ar2) to a single 'ar', and coef[3] (sma1) to a non-existent
# non-seasonal 'ma' term. Map by name instead.
# NOTE(review): fit4 was estimated on the Box-Cox scale (lambda = lmbd),
# so this simulation is on the transformed scale — confirm intent.
simm<-sim_sarima(n=64, model=list(ar=fit4$coef[c("ar1","ar2")],
sma=fit4$coef["sma1"],iorder=1, siorder=1, nseasons=4), rand.gen = rnorm)
simm_df<-data.frame(time=t1,value=simm)
p<-ggplot(simm_df,aes(x=time,y=value))+
geom_point(color='darkblue')+
geom_line(color='darkblue')+
theme_gray()+
xlab('Time') +
ylab('Value')+
ggtitle("Simulation")
fig<-ggplotly(p)
fig